Data Vizualization

Setup

Load library

# SOURCE: https://cedricscherer.netlify.app/2019/05/17/the-evolution-of-a-ggplot-ep.-1/#aim 

# Packages
required_packages <- c(
                       "tidyverse",   
                       "rgdal",
                       "RColorBrewer",
                       "leaflet",
                       "rgeos",
                       "openxlsx",
                       "ggthemes",
                       "tidyverse"
                       )      

for (pkg in required_packages) {
  # install packages if not already present
  if (!pkg %in% rownames(installed.packages())) {
    install.packages(pkg)
  }
  
  # load packages to this current session 
  library(pkg, character.only = TRUE)
}

Load datasets

library(tidyverse)

df_students <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-05-07/student_teacher_ratio.csv")

df_world_tile <- readr::read_csv("https://gist.githubusercontent.com/maartenzam/787498bbc07ae06b637447dbd430ea0a/raw/9a9dafafb44d8990f85243a9c7ca349acd3a0d07/worldtilegrid.csv") %>% 
  mutate(
    ## Namibias two-digit country code is handled as `NA` - let us fix that
    alpha.2 = if_else(name == "Namibia", "NA", alpha.2),
    ## We are going to split "Americas" into "North America" and "Sout America"
    region = if_else(region == "Americas", sub.region, region),
    region = if_else(region %in% c("Northern America", "Central America", "Caribbean"), 
                     "North America", region),
    region = if_else(region == "Southern America", "South America", region),
    ## to join both data sets, we need a id column
    country_code = alpha.3
  )

df_ratios <- df_students %>% 
  ## Let's keep only the most recent data per country
  group_by(country, indicator) %>% 
  filter(year == max(year)) %>% 
  ungroup() %>% 
  # Create `NA`s for countries which do not have any data 2012-2018
  complete(indicator, nesting(country, country_code)) %>% 
  ## Let's focus on primary education and keep only countries (coded by letters)
  filter(
    indicator == "Primary Education",
    str_detect(country_code, "[A-Z]")
  ) %>% 
  ## merge with world tile map data
  full_join(df_world_tile) %>%
  filter(
    !is.na(region),
    !is.na(indicator)
  ) %>% 
  group_by(region) %>% 
  mutate(student_ratio_region = median(student_ratio, na.rm = T)) %>% 
  ungroup()

First plots

p1<- ggplot(df_ratios, aes(x = region, y = student_ratio)) +
  geom_boxplot()
p1

# Sort the data
df_sorted <-
  df_ratios %>%
  mutate(region = fct_reorder(region, -student_ratio_region))


p2<- ggplot(df_sorted, aes(x = region, y = student_ratio)) +
  geom_boxplot()
p2

p2+ labs(title = "Student to teacher ratio",
         subtitle = "Lastest available data",
         y = "Student to teacher ratio",
         x = "Region", 
         caption = "Source: UNESCO")

p3<- ggplot(df_sorted, aes(x = region, y = student_ratio)) +
  geom_boxplot() +
  coord_flip() +
  scale_y_continuous(limits = c(0, 90)) +
  labs(title = "Student to teacher ratio",
         subtitle = "Lastest available data",
         y = "Student to teacher ratio",
         x = "Region", 
         caption = "Source: UNESCO")

p3

Change default plot settings

 tema_enmisp<- theme(panel.grid.minor = element_blank(),
        panel.grid.major.y = element_blank(),
        legend.title=element_blank(),
        axis.text = element_text(face = "bold", size = 8, color = "black"),
        axis.title = element_text(size = 10),
        plot.title = element_text(face = "bold", size = 12),
        legend.position = "none",
        plot.caption = element_text(size = 8, face = "italic")
        )

p4<- ggplot(df_sorted, aes(x = region, y = student_ratio, color = region)) +
  geom_jitter(size = 2, alpha = 0.25, width = 0.2) +
  stat_summary(fun = mean, geom = "point", size = 5) +
  #geom_boxplot() +
  coord_flip() +
  scale_y_continuous(limits = c(0, 90)) +
  labs(title = "Student to teacher ratio",
         subtitle = "Lastest available data",
         y = "Student to teacher ratio",
         x = "Region", 
         caption = "Source: UNESCO") +
  theme_minimal() +
  tema_enmisp



p4 

world_avg <-
  df_ratios %>%
  summarize(avg = mean(student_ratio, na.rm = TRUE)) %>%
  pull(avg)

p5 <- p4 +
  geom_segment(
    aes(x = region, xend = region,
        y = world_avg, yend = student_ratio_region),
    size = 0.8
  ) +
  geom_hline(aes(yintercept = world_avg), color = "gray70", size = 0.6) +
  geom_jitter(size = 2, alpha = 0.25, width = 0.2) +
  stat_summary(fun = mean, geom = "point", size = 5) 

p5

## Final Plot

## coordinates for arrows
arrows <-
  tibble(
    x1 = c(6, 3.65, 1.8, 1.8, 1.8),
    x2 = c(5.6, 4, 2.18, 2.76, 0.9),
    y1 = c(world_avg + 6, 10.5, 9, 9, 77),
    y2 = c(world_avg + 0.1, 18.4, 14.16, 12, 83.42)
  )

p5 +
    annotate(
      "text", x = 6.3, y = 35, 
      size = 2.7, color = "gray20",
      label = glue::glue("Worldwide average:\n{round(world_avg, 1)} students per teacher")
    ) +
    annotate(
      "text", x = 3.5, y = 10,
      size = 2.7, color = "gray20",
      label = "Continental average"
    ) +
    annotate(
      "text", x = 1.7, y = 11, 
      size = 2.7, color = "gray20",
      label = "Countries per continent"
    ) +
    annotate(
      "text", x = 1.9, y = 64, 
      size = 2.7, color = "gray20",
      label = "The Central African Republic has by far\nthe most students per teacher"
    ) +
    geom_curve(
      data = arrows, aes(x = x1, xend = x2,
                         y = y1, yend = y2),
      arrow = arrow(length = unit(0.08, "inch")), size = 0.5,
      color = "gray20", curvature = -0.3#
    )  

Save plot

ggsave( filename = here::here("figures", 
                  paste0(Sys.Date(), "_student_ratio", ".png")),
                  width = 20.49,
                  height = 10.3,
                  units = c("cm"),
                  dpi = "retina")

Line plots

sns_epe <- readRDS("datasets/sns_epe.rds")
sns_sum_m <- readRDS("datasets/sns_sum_m.rds")

p6 <- ggplot(data = sns_epe,
            mapping = aes(x = date,
                          y = arrears_sum,
                          colour = year)) +
  
            geom_point(alpha = 0.5) +
  
  geom_smooth(method = loess, 
              se = FALSE) +
  
  scale_x_date(date_breaks = "1 year", date_labels =  "%Y") +
  
  geom_vline(xintercept=as.numeric(as.Date("2015-11-01")), colour="red") + 
  
  geom_text(aes(x=as.Date("2015-09-01"), 
                label="XXI Governo",y=700), 
            colour="black", 
            angle=90, 
            vjust = 1.2,
            size = 2) +
  
  geom_vline(xintercept=as.numeric(as.Date("2019-10-01")), colour="red") + 
  
  geom_text(aes(x=as.Date("2019-08-01"), 
                label="XXII Governo",y=700), 
            colour="black", 
            angle=90, 
            vjust = 1.2,
            size = 2)+ 
  labs(title = "Dívidas em atraso  dos Hospitais EPE",
           subtitle = "2014 - 2020",
           color='Ano (linear)',
           y = "Dívidas em milhões Euros",
           x = "Ano")  + 
  theme(plot.title = element_text(hjust = 0.5), 
        plot.subtitle = element_text(hjust = 0.5)) +
theme_minimal() +
tema_enmisp

p6


sns_sum_m$colour <- ifelse(sns_sum_m$arrear_m_var1 <= 0, "Negative","Positive")
sns_sum_m$hjust <- ifelse(sns_sum_m$arrear_m_var1 > 0, 1.3, -0.3)

p7 <- ggplot(data = sns_sum_m,
            mapping = aes(x = date,
                          y = arrear_m_var1,
                          label="",
                          hjust=hjust,
                          na.rm = TRUE)) +
            geom_bar(stat="identity",
                     position="identity",
                     aes(fill = colour))  + 
#  geom_text(aes(y=0,colour=colour)) +
  scale_fill_manual(values=c(Positive="#00BFC4",
                             Negative="#F8766D")) +
  scale_x_date(date_breaks = "1 year", date_labels =  "%Y") +
  labs(subtitle = "Variação mensal das dívidas ",
           y = "Variação mensal",
           x = "Ano",
           caption = "Source: Portal da Transparencia SNS",
           fill = "Arrears Var")  + 
  theme(plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5)) +
theme_minimal() +
tema_enmisp

p7

Patchwork

#install.packages("patchwork")
library(patchwork)

p8<- p6 / p7 

p8

Save plot

ggsave( filename = here::here("figures", 
                  paste0(Sys.Date(), "_hospital_debt", ".png")),
                  width = 20.49,
                  height = 10.3,
                  units = c("cm"),
                  dpi = "retina")

Create a Map




comm_pt <- readOGR(
  dsn= "datasets/concelhos-shapefile" , ###
  layer="concelhos",
  verbose=FALSE,
  use_iconv = TRUE,   ###  
  encoding = "UTF-8"  ### 
)

# make the polygons a bit less verbose
comm_pt1 <- gSimplify(comm_pt, 0.01, topologyPreserve=TRUE)
comm_pt1 = SpatialPolygonsDataFrame(comm_pt1, data=comm_pt@data)

comm_pt1@data <- comm_pt1@data %>%
  dplyr::select(
    "ISO",
    "ID_1",
    "NAME_1",
    "ID_2",
    "NAME_2",
  )

covid_inc <- read_csv("datasets/covid_inc.csv") 


covid_inc <- covid_inc %>%
  rename(NAME_2=Concelho) %>%
  mutate( NAME_2=recode(NAME_2,
                         'Ponte de Sor' = "Ponte de Sôr"
                         ))


comm_pt1@data<-left_join(comm_pt1@data,
                     covid_inc,
                     by = "NAME_2")



#comm_pt1@data<- comm_pt1@data %>%
#  filter(
#    !ARS %in% c("Açores", "Madeira")
#  )
mybins <- c(0,60,120,240,480,Inf)
mycolors <- c("#f7f7f7","#fee391","#fb6a4a","#cb181d","#67000d")
mypalette <- colorBin( palette=mycolors, domain=comm_pt1@data$Incidencia, na.color="transparent", bins=mybins)

# Prepare the text for tooltips:
mytext <- paste(
    "Concelho: ", comm_pt1@data$NAME_2,"<br/>",
    "Incidência: ", comm_pt1@data$Incidencia, "<br/>",
    "Categoria: ", comm_pt1@data$Incidencia_descritivo,
    sep="") %>%
  lapply(htmltools::HTML)

# Final Map
leaflet(comm_pt1) %>%
  addTiles()  %>%
  setView( lat=39.5, lng=-7.6, zoom=7) %>%
  addProviderTiles("CartoDB.Positron") %>%
  addPolygons(
    fillColor = ~mypalette(Incidencia),
    stroke=TRUE,
    fillOpacity = 0.9,
    color="grey",
    weight=0.3,
    label = mytext,
    labelOptions = labelOptions(
      style = list("font-weight" = "normal", padding = "3px 8px"),
      textsize = "13px",
      direction = "auto"
    )
  ) %>%
  addLegend( pal=mypalette, values=~Incidencia, opacity=0.9, title = "Incidência", position = "bottomleft" )
library(viridis)

# I need to fortify the data AND keep trace of the commune code! (Takes ~2 minutes)
library(broom)
comm_pt1_fortified <- tidy(comm_pt1, region = "NAME_2")

ggplot() +
  geom_polygon(data=comm_pt1_fortified,
               aes(x=long,
                   y=lat,),
               color="white",
               size=.2) +
  theme_void() +
  coord_map() +
  scale_fill_binned(type = "viridis", direction=-1) +
  labs(title="Population in New York City",
       subtitle="Neighborhoods are filled by population",
       fill="Population")